Conversation

@KavinTheG
Contributor

@KavinTheG KavinTheG commented Nov 6, 2025

Resolves #165694

@llvmbot
Member

llvmbot commented Nov 6, 2025

@llvm/pr-subscribers-backend-x86

Author: Kavin Gnanapandithan (KavinTheG)

Changes

Resolves issue #165694.


Patch is 49.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166839.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+73)
  • (modified) llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll (+163-108)
  • (modified) llvm/test/CodeGen/X86/ldexp-avx512.ll (+129-384)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406bd8e0d7..f9e9bb26638d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2590,6 +2590,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
   }
 
+  if (Subtarget.hasAVX512()) {
+    for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
+      setOperationAction(ISD::FLDEXP, VT, Custom);
+    
+    if (Subtarget.hasVLX()) {
+      for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 }) 
+        setOperationAction(ISD::FLDEXP, VT, Custom);
+
+      if (Subtarget.hasFP16()) {
+        for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 })
+          setOperationAction(ISD::FLDEXP, VT, Custom);
+      }
+    }
+    
+    if (Subtarget.hasFP16()) {
+      for (MVT VT : { MVT::f16, MVT::v32f16 })
+        setOperationAction(ISD::FLDEXP, VT, Custom);
+    }
+  }
+
   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
   // is. We should promote the value to 64-bits to solve this.
   // This is what the CRT headers do - `fmodf` is an inline header
@@ -19142,6 +19162,58 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   return SDValue();
 }
 
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  MVT XTy = X.getSimpleValueType();
+  SDValue Exp = Op.getOperand(1);
+  MVT XVT, ExpVT; 
+
+  switch (XTy.SimpleTy) { 
+    default:
+      return SDValue();
+    case MVT::f16:
+      if (Subtarget.hasFP16()) {
+        XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16;
+        ExpVT = XVT;
+        break;
+      } 
+      X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+      [[fallthrough]];
+    case MVT::f32:
+      XVT = MVT::v4f32;
+      ExpVT = MVT::v4f32;
+      break;
+    case MVT::f64:
+      XVT = MVT::v2f64;
+      ExpVT = MVT::v2f64;
+      break;
+    case MVT::v4f32:
+    case MVT::v2f64:
+    case MVT::v8f32:
+    case MVT::v4f64:
+    case MVT::v16f32:
+    case MVT::v8f64:
+      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); 
+      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
+  }
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
+  SDValue VX =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
+  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
+                             DAG.getUNDEF(ExpVT), Exp, Zero);
+  SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX);
+  SDValue Final =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), Scalef, Zero);
+  if (X.getValueType() != XTy)
+    Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final,
+                        DAG.getIntPtrConstant(1, SDLoc(Op)));
+  return Final;
+}
+
 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -33672,6 +33744,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
   case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
+  case ISD::FLDEXP:             return LowerFLDEXP(Op, Subtarget, DAG);
   // clang-format on
   }
 }
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 81529aff39ff1..499695f408396 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -79,38 +79,64 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
 ; CHECK-SSE-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-SSE-NEXT:    retq
 ;
-; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
-; CHECK-AVX:       # %bb.0:
-; CHECK-AVX-NEXT:    subq $40, %rsp
-; CHECK-AVX-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX-NEXT:    vextractps $1, %xmm0, %edi
-; CHECK-AVX-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT:    callq ldexpf@PLT
-; CHECK-AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT:    vmovd %xmm0, %edi
-; CHECK-AVX-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT:    callq ldexpf@PLT
-; CHECK-AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT:    vextractps $2, %xmm0, %edi
-; CHECK-AVX-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT:    callq ldexpf@PLT
-; CHECK-AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT:    vextractps $3, %xmm0, %edi
-; CHECK-AVX-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT:    callq ldexpf@PLT
-; CHECK-AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-AVX-NEXT:    addq $40, %rsp
-; CHECK-AVX-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-AVX-NEXT:    retq
+; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    subq $40, %rsp
+; CHECK-AVX2-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vextractps $1, %xmm0, %edi
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT:    callq ldexpf@PLT
+; CHECK-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vmovd %xmm0, %edi
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT:    callq ldexpf@PLT
+; CHECK-AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vextractps $2, %xmm0, %edi
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT:    callq ldexpf@PLT
+; CHECK-AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vextractps $3, %xmm0, %edi
+; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT:    callq ldexpf@PLT
+; CHECK-AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-AVX2-NEXT:    addq $40, %rsp
+; CHECK-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-AVX2-NEXT:    retq
+;
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-ONLY-AVX512F:       # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT:    vcvtdq2ps %xmm0, %xmm1
+; CHECK-ONLY-AVX512F-NEXT:    vmovss {{.*#+}} xmm2 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-ONLY-AVX512F-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; CHECK-ONLY-AVX512F-NEXT:    vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; CHECK-ONLY-AVX512F-NEXT:    vcvtdq2ps %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-ONLY-AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-ONLY-AVX512F-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-ONLY-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-ONLY-AVX512F-NEXT:    retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-SKX-NEXT:    vscalefps %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT:    retq
   %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
   ret <4 x float> %r
 }
@@ -560,82 +586,109 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
-; CHECK-AVX512F:       # %bb.0:
-; CHECK-AVX512F-NEXT:    subq $72, %rsp
-; CHECK-AVX512F-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-AVX512F-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vpextrw $7, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $6, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $5, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $4, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $3, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $2, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vpextrw $1, %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT:    vmovd %xmm0, %eax
-; CHECK-AVX512F-NEXT:    movswl %ax, %edi
-; CHECK-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT:    callq ldexpf@PLT
-; CHECK-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512F-NEXT:    addq $72, %rsp
-; CHECK-AVX512F-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-AVX512F-NEXT:    retq
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-ONLY-AVX512F:       # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $7, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm2
+; CHECK-ONLY-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm2, %xmm1, %xmm2
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $6, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $5, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $4, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $3, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm3, %xmm1, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $2, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-ONLY-AVX512F-NEXT:    vpextrw $1, %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm4, %xmm1, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-ONLY-AVX512F-NEXT:    vmovd %xmm0, %eax
+; CHECK-ONLY-AVX512F-NEXT:    cwtl
+; CHECK-ONLY-AVX512F-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-ONLY-AVX512F-NEXT:    vscalefss %xmm0, %xmm1, %xmm0
+; CHECK-ONLY-AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-ONLY-AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-ONLY-AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-ONLY-AVX512F-NEXT:    retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpextrw $7, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm1
+; CHECK-SKX-NEXT:    vmovss {{.*#+}} xmm2 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-SKX-NEXT:    vscalefss %xmm1, %xmm2, %xmm1
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-SKX-NEXT:    vpextrw $6, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT:    vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; CHECK-SKX-NEXT:    vpextrw $5, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT:    vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT:    vpextrw $4, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT:    vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SKX-NEXT:    vpextrw $3, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm3
+; CHECK-SKX-NEXT:    vscalefss %xmm3, %xmm2, %xmm3
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; CHECK-SKX-NEXT:    vpextrw $2, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT:    vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; CHECK-SKX-NEXT:    vpextrw $1, %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm4
+; CHECK-SKX-NEXT:    vscalefss %xmm4, %xmm2, %xmm4
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; CHECK-SKX-NEXT:    vmovd %xmm0, %eax
+; CHECK-SKX-NEXT:    cwtl
+; CHECK-SKX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
+; CHECK-SKX-NEXT:    vscalefss %xmm0, %xmm2, %xmm0
+; CHECK-SKX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-SKX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-SKX-NEXT:    retq
   %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
   ret <8 x half> %r
 }
@@ -1769,3 +1822,5 @@ define x86_fp80 @pr128528(i1 %cond) {
   %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
   ret x86_fp80 %mul
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below...
[truncated]

@github-actions

github-actions bot commented Nov 6, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@RKSimon RKSimon requested review from RKSimon and phoebewang November 7, 2025 09:06
Collaborator

@RKSimon RKSimon left a comment

Nice - a few minors

SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
SDValue VX =
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
Collaborator

Use DAG.getInsertVectorElt
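
For reference, a minimal sketch of the suggested change, assuming DAG.getInsertVectorElt takes (DL, Vec, Elt, Idx); the exact signature should be checked against SelectionDAG.h:

    // Sketch only: replace the raw INSERT_VECTOR_ELT nodes with the
    // SelectionDAG helper; the parameter order here is an assumption.
    SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(XVT), X, 0);
    SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(ExpVT), Exp, 0);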

@KavinTheG
Contributor Author

  • Widened 128/256-bit vector types on non-VLX targets
  • Used SelectionDAG's helper abstractions (e.g., ISD::INSERT_VECTOR_ELT -> DAG.getInsertVectorElt)

@KavinTheG KavinTheG requested a review from RKSimon November 9, 2025 02:07
setOperationAction(ISD::FLDEXP, VT, Custom);

if (Subtarget.hasFP16()) {
for (MVT VT : {MVT::f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
Collaborator

If we're extending on non-FP16 targets anyway, why not drop the hasFP16 requirement and add these types to the default AVX512 type list above? The only tricky one will be MVT::v32f16, which will need splitting/concatenating.

I've added better half-vector coverage at #167294
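
For illustration, a rough sketch of the v32f16 splitting/concatenating this implies, assuming the halves can be re-lowered as v16f16 operations (names and structure are illustrative, not the final patch):

    // Sketch: split the v32f16 operands, lower each half, then concatenate.
    auto [XLo, XHi] = DAG.SplitVector(X, DL);
    auto [ExpLo, ExpHi] = DAG.SplitVector(Exp, DL);
    SDValue Lo = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, XLo, ExpLo);
    SDValue Hi = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, XHi, ExpHi);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32f16,
                       LowerFLDEXP(Lo, Subtarget, DAG),
                       LowerFLDEXP(Hi, Subtarget, DAG));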

Contributor Author

I'll give it a go.

Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
if (XTy.isVector()) {
SDValue WideX = DAG.getInsertSubvector(DL, DAG.getUNDEF(XVT), X, 0);
SDValue WideExp = DAG.getInsertSubvector(DL, DAG.getUNDEF(ExpVT), Exp, 0);
Collaborator

You should be able to just use widenSubVector and tell it to widen to 512 bits instead of recomputing XVT/ExpVT.
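
A sketch of what that could look like, assuming the widenSubVector overload that takes a destination width in bits (the exact parameters should be confirmed in X86ISelLowering.cpp):

    // Sketch only: widen the 128/256-bit operands to 512 bits rather than
    // recomputing XVT/ExpVT by hand; the parameter list is an assumption.
    SDValue WideX = widenSubVector(X, /*ZeroNewElements=*/false, Subtarget,
                                   DAG, DL, /*WideSizeInBits=*/512);
    SDValue WideExp = widenSubVector(Exp, /*ZeroNewElements=*/false, Subtarget,
                                     DAG, DL, /*WideSizeInBits=*/512);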

@KavinTheG
Contributor Author

  • Added half-vector handling, widening to f32 when FP16 is not available
  • Switched to widenSubVector, removing the need for XVT and ExpVT
  • Rebased to
    • include the new tests for half vectors
    • squash the previous clang-format commits

@KavinTheG KavinTheG requested a review from RKSimon November 12, 2025 19:05
SDValue OpHigh = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, High, ExpHigh);
SDValue ScaledLow = LowerFLDEXP(OpLow, Subtarget, DAG);
SDValue ScaledHigh = LowerFLDEXP(OpHigh, Subtarget, DAG);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32f16, ScaledLow,
Collaborator

You might be able to use splitVectorOp here?
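
If that helper fits, the manual split/concat above could collapse to something like the following (sketch, assuming splitVectorOp(Op, DAG, DL) halves the operation and concatenates the results):

    // Sketch only: defer the halving and concatenation to the existing
    // X86 helper instead of doing it by hand.
    return splitVectorOp(Op, DAG, DL);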

case MVT::v2f64:
if (Subtarget.hasVLX()) {
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
return DAG.getNode(X86ISD::SCALEFS, DL, XTy, X, Exp, X);
Collaborator

Why SCALEFS?

Contributor Author

Mistake on my part; it should be SCALEF.

@KavinTheG
Contributor Author

Quick question: given a non-VLX target and v4f32/v2f64 vectors, would it be better to return early with SCALEFS instead of widening, since SCALEFS can work on xmm registers? Like so.

  case MVT::v4f32:
  case MVT::v2f64:
    if (Subtarget.hasVLX()) {
      ...         
    }
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
    return DAG.getNode(X86ISD::SCALEFS, DL, XTy, X, Exp, X);

@RKSimon
Collaborator

RKSimon commented Nov 13, 2025

No, SCALEFS will only touch element[0], not the upper elements of the 128-bit vector.
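
In other words, the scalar node leaves lanes 1..N-1 untouched, while the packed node scales every lane. A sketch of the packed form the follow-up below adopts for v4f32/v2f64 (on non-VLX targets the operands would still need widening to a legal SCALEF width first):

    // Sketch: packed vscalefps/vscalefpd scale every element of the vector.
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);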

@KavinTheG
Contributor Author

  • Called SCALEF for v2f64/v4f32
  • Refactored the switch statement

@KavinTheG KavinTheG requested a review from RKSimon November 14, 2025 16:56
Development

Successfully merging this pull request may close these issues.

[X86] Lower mathlib call ldexp into scalef when avx512 is enabled
